Image Similarity¶

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm.notebook import tqdm
from scipy.spatial.distance import cosine, chebyshev, canberra, braycurtis, euclidean
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import seaborn as sns
from matplotlib import rcParams
import pickle
In [3]:
# Unified pairs table: one row per before/after image pair with its human label
# ("Same" / "Not same" / "Not enough context elements") and numeric target
# lab_num. "Unnamed: 0" is a stale index column left by a previous to_csv.
# NOTE(review): absolute local path — consider a configurable DATA_DIR.
df = pd.read_csv('/home/jupyter/REPO EXPLO/Image_similarity/df_unified.csv').drop("Unnamed: 0", axis=1)
df.head()
Out[3]:
path_x numint path_y nom_ls gs_uri label lab_num
0 2023-03-10#pto#avant#0000019200133#565aa5aa477... 19200133 2023-03-10#pto#apres#0000019200133#7d8f75e40f4... 19200133_4298 gs://ofr-vqi-data-pipeline-models-dev/notebook... Same 0
1 2023-03-13#pto#avant#0000021156692#4b2aa6c3942... 21156692 2023-03-13#pto#apres#0000021156692#134ff6b528e... 21156692_5026 gs://ofr-vqi-data-pipeline-models-dev/notebook... Not enough context elements 0
2 2023-03-10#pto#avant#0000017998364#0dfdd44fd1d... 17998364 2023-03-10#pto#apres#0000017998364#0a059bc7f01... 17998364_3768 gs://ofr-vqi-data-pipeline-models-dev/notebook... Same 0
3 2023-03-10#pto#avant#0000022631562#18983b6cc91... 22631562 2023-03-10#pto#apres#0000022631562#5ae856a5232... 22631562_5294 gs://ofr-vqi-data-pipeline-models-dev/notebook... Same 0
4 2023-03-13#pto#avant#0000023208443#279f5755fb3... 23208443 2023-03-13#pto#apres#0000023208443#081a25ab7af... 23208443_5745 gs://ofr-vqi-data-pipeline-models-dev/notebook... Same 0
In [4]:
# Total number of labeled image pairs.
len(df)
Out[4]:
9705
In [5]:
# Class distribution — heavily imbalanced toward "Same" (8190 of 9705 rows),
# which motivates the SMOTE oversampling applied before classification below.
df["label"].value_counts()
Out[5]:
Same                           8190
Not enough context elements     838
Not same                        677
Name: label, dtype: int64

Visualisation¶

In [6]:
def load_image_viz(path1, path2,
                   base_dir="/home/jupyter/Notebooks/folder_20k/imgs_paire_pto_10k/",
                   size=(600, 600), gap=25):
    """Load a before/after image pair and montage them side by side.

    Parameters
    ----------
    path1, path2 : str
        File names of the two images, relative to ``base_dir``.
    base_dir : str
        Directory prefix prepended to both paths (default matches the
        original hard-coded location, so existing calls are unchanged).
    size : tuple[int, int]
        (width, height) each image is resized to before concatenation.
    gap : int
        Width in pixels of the white separator between the two images.

    Returns
    -------
    np.ndarray
        RGB uint8 canvas containing both resized images separated by a
        white vertical band.

    Raises
    ------
    FileNotFoundError
        If either image cannot be read from disk.
    """
    imgs = []
    for p in (path1, path2):
        img = cv2.imread(base_dir + p)
        if img is None:
            # cv2.imread silently returns None on a bad/missing path; fail
            # loudly here instead of letting cvtColor raise a cryptic error.
            raise FileNotFoundError(f"Could not read image: {base_dir + p}")
        imgs.append(cv2.resize(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), size))
    img1, img2 = imgs
    h1, w1, _ = img1.shape
    h2, w2, _ = img2.shape
    height = max(h1, h2)
    width = w1 + w2 + gap
    # White canvas; images are pasted on either side of the separator.
    result = np.ones((height, width, 3), dtype=np.uint8) * 255
    result[0:h1, 0:w1, :] = img1
    result[0:h2, w1 + gap:w1 + w2 + gap, :] = img2
    return result
In [13]:
# One example pair per label category (plus a second "not enough context" one).
y = ("Same","Not same","Not enough context elements","Not enough context elements")
titles = ("Same", "Not same", "Not enough context elements exemple 1", "Not enough context elements exemple 2")
#Same : 6, 28, 35, 42, 63 ; Not same : 3, 8, 20 ; Not enough context elements : 0, 21, 22
idx = (42,8,6,111)
fig, axs = plt.subplots(2, 2, figsize=(12, 8))

# FIX: the original if/else branches were byte-identical copy-paste; a single
# loop over the flattened axes grid does the same work once.
for z, ax in enumerate(axs.flat):
    # Filter once per panel (the original recomputed the same boolean mask
    # twice, once for path_x and once for path_y).
    subset = df[df["label"].str.contains(y[z])].reset_index()
    ax.imshow(load_image_viz(subset['path_x'][idx[z]], subset['path_y'][idx[z]]))
    ax.set_title(titles[z])
    ax.axis("off")

fig.tight_layout()
In [14]:
# Four more examples of the ambiguous "Not enough context elements" label.
y = ("Not enough context elements","Not enough context elements","Not enough context elements","Not enough context elements")
titles = ("Not enough context elements", "Not enough context elements", "Not enough context elements", "Not enough context elements")
#Same : 6, 28, 35, 42, 63 ; Not same : 3, 8, 20 ; Not enough context elements : 0, 21, 22
idx = (71,28,89,114)
fig, axs = plt.subplots(2, 2, figsize=(12, 8))

# FIX: the original if/else branches were byte-identical copy-paste; a single
# loop over the flattened axes grid does the same work once.
for z, ax in enumerate(axs.flat):
    # Filter once per panel instead of twice (same mask was recomputed for
    # path_x and path_y in the original).
    subset = df[df["label"].str.contains(y[z])].reset_index()
    ax.imshow(load_image_viz(subset['path_x'][idx[z]], subset['path_y'][idx[z]]))
    ax.set_title(titles[z])
    ax.axis("off")

fig.tight_layout()

Image Similarity¶

Architecture¶

In [17]:
# Render the pipeline architecture diagram.
# NOTE(review): from the code below this appears to be: image pair -> shared
# ResNet-152 feature extractor -> pairwise distances -> SVC; confirm against
# the actual figure.
Image.open("/home/jupyter/REPO EXPLO/Image_similarity/archi_dist.png")
Out[17]:

Code¶

In [26]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# ImageNet-pretrained ResNet-152 used as a frozen feature extractor.
# NOTE(review): `pretrained=True` is deprecated in newer torchvision; the
# modern equivalent is `weights=models.ResNet152_Weights.IMAGENET1K_V1`.
resnet = models.resnet152(pretrained=True)

# Drop the final fully-connected layer; the remaining stack ends at the
# global average pool, yielding a 2048-d feature vector per image.
output = nn.Sequential(*list(resnet.children())[:-1])

output.to(device)
# BUG FIX: switch to inference mode. Without .eval() the BatchNorm layers use
# per-batch statistics, which with batch size 1 (as in the prediction
# functions below) yields unstable features instead of the pretrained running
# statistics. NOTE(review): any DF_DIST.csv cached before this fix was built
# from train-mode features and should be regenerated.
output.eval()

def load_image_torch(path):
    """Read an image from disk and prepare it for the ResNet feature extractor.

    Returns a 224x224 RGB float array min-max scaled to [0, 1].
    NOTE(review): this does NOT apply the ImageNet mean/std normalization the
    pretrained ResNet was trained with — confirm this is intentional, since
    the cached distances and the SVC were built on these features.
    """
    img = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
    x = cv2.resize(img, (224, 224))
    # Per-image min-max rescaling to [0.0, 1.0] (NORM_MINMAX maps the image's
    # own min/max to the given bounds).
    x = cv2.normalize(x.astype('float'), None, 0.0, 1.0, cv2.NORM_MINMAX)
    #x = np.expand_dims(x, axis = 0)
    return x

def prediction_avant(i):
    """Return the ResNet feature vector (list of floats) for the 'before'
    (avant) image of pair i, looked up via df["path_x"]."""
    img_path = "/home/jupyter/Notebooks/folder_20k/imgs_paire_pto_10k/" + df["path_x"].iloc[i]
    with torch.no_grad():
        # Single-image batch on the extraction device.
        batch = transform(load_image_torch(img_path)).unsqueeze(0).float().to(device)
        return output(batch).squeeze().tolist()

def prediction_apres(i):
    """Return the ResNet feature vector (list of floats) for the 'after'
    (apres) image of pair i, looked up via df["path_y"]."""
    img_path = "/home/jupyter/Notebooks/folder_20k/imgs_paire_pto_10k/" + df["path_y"].iloc[i]
    with torch.no_grad():
        # Single-image batch on the extraction device.
        batch = transform(load_image_torch(img_path)).unsqueeze(0).float().to(device)
        return output(batch).squeeze().tolist()

# ToTensor converts the HWC float array from load_image_torch to a CHW tensor;
# values are already in [0, 1], so no further scaling happens here.
transform = transforms.Compose([
        transforms.ToTensor()
    ])
In [ ]:
# Compute five pairwise distances between the before/after feature vectors of
# every pair in df.
dist_cos = []
dist_euc = []
dist_cheb = []
dist_can = []
dist_bray = []
for i in tqdm(range(len(df))):
    pred1 = prediction_avant(i)
    pred2 = prediction_apres(i)

    dist_cos.append(cosine(pred1, pred2))
    # BUG FIX: euclidean and braycurtis were swapped in the original
    # (dist_euc collected Bray-Curtis values and dist_bray collected
    # Euclidean values), so the DataFrame columns were silently mislabeled.
    dist_euc.append(euclidean(pred1, pred2))
    dist_cheb.append(chebyshev(pred1, pred2))
    dist_can.append(canberra(pred1, pred2))
    dist_bray.append(braycurtis(pred1, pred2))

# Column names fixed for consistency with the results section below, which
# expects "chebyshev" (not "chebychev") and "lab_num" (not "label").
DF_DIST = pd.DataFrame({"cosine": dist_cos, "braycurtis": dist_bray, "chebyshev": dist_cheb,
                        "canberra": dist_can, "euclidean": dist_euc, "lab_num": df["lab_num"].tolist()})
In [ ]:
# Canberra and Bray-Curtis live on very different numeric ranges than the
# other distances, so rescale those two columns to [0, 1].
# NOTE(review): fit_transform on the full dataset leaks test-set statistics
# into the scaler — consider fitting on the training split only.
scaler = MinMaxScaler()
DF_DIST[["canberra","braycurtis"]] = scaler.fit_transform(DF_DIST[["canberra","braycurtis"]])
In [ ]:
#DF_DIST.to_csv("REPO EXPLO/Image_similarity/DF_DIST.csv")

Résultats¶

In [32]:
# Reload the cached pairwise distances and drop the stale index column plus
# the two distances excluded from the final model (chebyshev, cosine).
df_dist = pd.read_csv("/home/jupyter/REPO EXPLO/Image_similarity/DF_DIST.csv")
df_dist = df_dist.drop(["Unnamed: 0","chebyshev","cosine"], axis=1)

# Stratified 70/30 split keeps the heavy class imbalance proportional across
# train and test; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(df_dist.drop(["lab_num"], axis=1),
                                                    df_dist["lab_num"],
                                                    test_size=0.3,
                                                    stratify= df_dist["lab_num"],
                                                    random_state=42)

# SMOTE oversamples the minority class on the TRAIN split only (correct
# ordering: balancing before the split would leak synthetic neighbors of
# test points into training).
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
In [ ]:
# RBF-kernel SVM on the SMOTE-balanced distance features.
# NOTE(review): max_iter=1000 can stop before convergence (sklearn's default
# is -1 / unlimited) — confirm no ConvergenceWarning is raised on this data.
clf = SVC(C=10, kernel="rbf", tol=0.0001, max_iter= 1000)

clf.fit(X_train_resampled, y_train_resampled)

y_pred = clf.predict(X_test)

# Row-normalized confusion matrix: each cell is the fraction of its true
# class, rounded to 2 decimals (rows sum to ~1).
cm = confusion_matrix(y_test, y_pred)
cm = np.round(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis], 2)
rcParams['figure.figsize'] = 8 ,5
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=["Same", "Not Same"], yticklabels=["Same", "Not Same"])
Out[5]:
<Axes: >
In [ ]:
# with open('/home/jupyter/REPO EXPLO/Image_similarity/SVC_FINAL.pkl', 'wb') as f:
#     pickle.dump(clf, f)